def binaryDecision(train_fn, test_fn, window, isALS, embedding_path, isSave=False):
    # train data, sent (<b> </b> marked preposition)
    # train label: 1 - should be corrected, 0 - not corrected
    orig_prep_list, sent_list, correct_prep_list = readData(train_fn)
    train_label = [int(orig_prep_list[ind] != correct_prep_list[ind])
                   for ind in range(len(sent_list))]
    train_feature = binarySelectFeatures(sent_list, window, orig_prep_list,
                                         isALS, embedding_path)

    # test data
    orig_prep_list, sent_list, correct_prep_list = readData(test_fn)
    gold_label = [int(orig_prep_list[ind] != correct_prep_list[ind])
                  for ind in range(len(sent_list))]
    test_feature = binarySelectFeatures(sent_list, window, orig_prep_list,
                                        isALS, embedding_path)

    # DecisionTree classifier
    test_label = TreeClf(train_feature, train_label, test_feature)
    if isSave:
        with open("model/binary_decision", "wb") as handle:
            pickle.dump(test_label, handle)
        print "done dumping binary labels..."

    # binary evaluation
    precision = precision_score(gold_label, test_label)
    recall = recall_score(gold_label, test_label)
    fscore = f1_score(gold_label, test_label)
    print "selected instances:", np.sum(test_label)
    print "prec: %f, recall: %f, fscore: %f" % (precision, recall, fscore)
def processData(self, data, connection):
    print "Processing " + data
    try:
        if data:  # Make sure the data was received properly
            request = util.readData(data)  # Parse the raw request
            print request
            if request['action'] == 'LOGIN':  # If we've found the login tag...
                return self.login(request, connection)
            elif request['action'] == 'CREATE_ACCOUNT':
                return self.createAccount(request)
            elif request['action'] == "UPDATE_CHAT":
                return self.updateChat(request)
            elif request['action'] == "ADD_CHAT_USER":
                return self.addUserToChat(request)
            elif request['action'] == "REMOVE_CHAT_USER":
                return self.removeUserFromChat(request)
            elif request['action'] == "LIST_CHATS":
                return self.listChats(request)
            elif request['action'] == "LIST_CHAT_CONTENTS":
                return self.listChatContents(request)
            elif request['action'] == "ADD_CHAT":
                return self.makeChat(request)
            elif request['action'] == "REMOVE_CHAT":
                return self.removeChat(request)
            else:  # We didn't recognize this query...
                clientResponse = util.makeResponse(request['action'], "FAILURE",
                                                   {"info": "ACTION UNDEFINED"}, "")
                util.printInfo(clientResponse)
                return clientResponse
        return util.makeResponse("NO_DATA_RECIEVED", "FAILURE",
                                 {"info": "No data received"}, "")
    except Exception as e:
        return util.makeResponse("CRASH", "FAILURE", {"info": str(e)}, "")
def __init__(self, data_set):
    data, NClass, dictL2I, dictI2L = util.readData("../data/" + data_set + '.csv')
    pData = 0.7  # proportion of training data
    N = int(pData * len(data))
    random.shuffle(data)
    self.trainingData = data[:N]
    self.testData = data[N:]
def run(args):
    # create output folder
    if not os.path.isdir(args.o):
        os.makedirs(args.o)
        print("Create output dir: %s" % args.o)
    else:
        sys.exit("Error: Output dir %s already exists." % args.o)
    util.print_log(time.ctime() + " Start running.", args.o + "/log.txt")

    # Print arguments.
    util.print_log("Input 1D signal file: %s" % (args.i), args.o + "/log.txt")
    util.print_log("Input Hi-C file: %s" % (args.hic), args.o + "/log.txt")
    util.print_log("Input bin size file: %s" % (args.g), args.o + "/log.txt")
    util.print_log("Resolution: %s nt" % (args.w), args.o + "/log.txt")
    util.print_log("Number of states: %s" % (args.n), args.o + "/log.txt")
    util.print_log("Number of cores to use: %s" % (args.p), args.o + "/log.txt")

    # Read data
    bin_data = util.readBedGraph(args.g)
    input_data = util.readData(args.i)
    util.print_log(time.ctime() + " Finished reading input.", args.o + "/log.txt")

    # Create Hi-C matrix
    (n, d) = input_data.shape
    edges = util.create_hic_matrix(args.hic, n)
    util.print_log(time.ctime() + " Finished creating edges.", args.o + "/log.txt")

    # Create graph object
    hmrf = mrf.MarkovRandomField(n=n, edges=edges, obs=input_data, args=args)

    # initialization
    hmrf.init_gmm()
    print(hmrf.label)
    util.print_log(time.ctime() + " Init GMM.", args.o + "/log.txt")
    hmrf.init_trans()
    util.print_log(time.ctime() + " Init trans matrix.", args.o + "/log.txt")
    print(hmrf.edge_potential)

    # inference of states
    hmrf.solve()

    # Save output
    np.savetxt(args.o + "/state_" + str(hmrf.n_state), hmrf.label,
               delimiter='\n', fmt='%d')
    if args.save:
        util.print_log(time.ctime() + " Save model.", args.o + "/log.txt")
        util.save_variable(hmrf, args.o + "/model.pkl")
def main():
    parser = argparse.ArgumentParser(description='Robot Tracker')
    parser.add_argument('input_filename', type=str, help='input file name')
    parser.add_argument('-o', '--output_filename', type=str, default='prediction.txt',
                        help='output file name')
    # store_true avoids the argparse type=bool pitfall (any non-empty string is truthy)
    parser.add_argument('-v', '--visualize', action='store_true',
                        help='whether to visualize output')
    parser.add_argument('-t', '--test', action='store_true',
                        help='use the last 2 seconds of the file as a test and output predicted error')
    args = parser.parse_args()

    data = readData(args.input_filename)
    trainingData = data
    testData = None
    if args.test:
        # If in testing mode, use the last 60 frames as test data
        trainingData = data[:-60]
        testData = data[len(data) - 60:]

    predictions = predict(trainingData, args.visualize)
    assert len(predictions) == 60
    with open(args.output_filename, 'w') as f:
        for element in predictions:
            f.write("%d,%d\n" % (element[0], element[1]))
def run(data_set, size, gen_num, times):
    '''
    :param data_set: data set name
    :param size: population size
    :param gen_num: number of generations per run
    :param times: total number of runs
    '''
    data, NClass, dictL2I, dictI2L = util.readData("./data/" + data_set + '.csv')
    pData = 0.7  # proportion of training data
    N = int(pData * len(data))
    random.shuffle(data)
    trainingData = data[:N]
    testData = data[N:]
    accuracy = 0

    # build the initial population of rule sets
    population = []
    while len(population) < size:
        # use_data = random.randint(1, 15)
        use_data = 20
        init_trainingData_idx = random.sample(range(0, len(trainingData)), use_data)
        init_trainingData = [trainingData[idx] for idx in init_trainingData_idx]
        RS = fuzzyRule.rule_set(init_trainingData)
        if len(RS.rules) > 0:
            RS.getFitness(trainingData)
            population.append(RS)

    # for RS in population:
    #     print(str(RS.fitness) + ' ' + str(40 - RS.fitness2))

    time_start = time.time()
    for i in range(times):
        # p = [0.9, 0.25, 0.5, 0.9, 0.25]
        p = [0.9, 0.25, 0.5, 0.9, 0.25]
        constant = [1]
        print("start")
        pareto_set, population = algorithm.NSGAII(population=population, p=p,
                                                  gen_num=gen_num, constant=constant,
                                                  size=size, trainingData=trainingData)
        time_end = time.time()
        time_cost = time_end - time_start
        time_info = "time cost: " + str(time_cost) + '\r' + \
                    "time each gen: " + str(time_cost / gen_num * (i + 1))
        RS_info = ''
        print(time_info)
        print()
        print('Result')
        shown = set()
        for RS in pareto_set:
            if RS.fitness2 in shown:
                pass
            else:
                shown.add(RS.fitness2)
                RS_before = "Before refit: " + str(RS.fitness) + ' ' + \
                            str(40 - RS.fitness2) + ' ' + str(RS.correct_num)
                print(RS_before)
                RS.getFitness(testData)
                RS_after = "After refit: " + str(RS.fitness) + ' ' + \
                           str(40 - RS.fitness2) + ' ' + str(RS.correct_num)
                print(RS_after)
                RS_info += RS_before + '\r' + RS_after + '\r\n'
                RS.getFitness(trainingData)

        result_print = time_info + '\r\nResult\r\n' + RS_info
        path = './运行结果/' + data_set + '/result data/'  # "运行结果" = "run results"
        exist_result = [int(x[:-4].split(' ')[0]) for x in os.listdir(path)]
        last_result = max(exist_result) if exist_result else 0
        write_as = path + '{0} c {1} g {2} s {3} e {4}.txt'.format(
            last_result + 1, 1, (i + 1) * gen_num, size, 0)
        with open(write_as, 'w') as f:
            f.write(result_print)
    # (excerpt continues from the tail of the model's decode method)
    # output: theta_d in Eq 5
    return torch.sigmoid(z)  # we could let z go through some neural layers

    def forward(self, sentence):
        # input: sentence is the sentence of one document
        # output: theta, mu, log_sigma
        mu, log_sigma, h_list, q_z = self.encode(sentence)
        z = self.reparameterize(mu, log_sigma)
        return self.decode(z), mu, log_sigma, h_list, q_z


def loadEmb():
    return dict()  # TODO


data = util.readData('../data/featureTest.txt')
A = dict()
phi = torch.randn(D, D, requires_grad=True)
emb_vector = loadEmb()  # TODO


def loss_func(K, sigma_1, sigma_0, mu_1, mu_0, theta, h_list, q_z, sentence):
    # loss function for one document
    # Eq 8
    inv_sigma_1 = torch.inverse(torch.diag(sigma_1))
    mu_1_2 = mu_1 - mu_0
    # 1st term
    result = 0.5 * (torch.trace(torch.mul(inv_sigma_1, sigma_0)) +
                    torch.mul(torch.mul(torch.t(mu_1_2), inv_sigma_1), mu_1_2) - K +
                    math.log(float(torch.prod(sigma_1)) / float(torch.prod(sigma_0))))
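The "Eq 8" term in loss_func appears to be the closed-form KL divergence between two K-dimensional Gaussians with diagonal covariances (Eq 8 itself is not part of this excerpt, so this mapping is an assumption based on the variable names). For reference, the standard formula is:

D_{KL}\bigl(\mathcal{N}(\mu_0,\Sigma_0)\,\|\,\mathcal{N}(\mu_1,\Sigma_1)\bigr)
= \tfrac{1}{2}\Bigl[\operatorname{tr}(\Sigma_1^{-1}\Sigma_0)
+ (\mu_1-\mu_0)^{\top}\Sigma_1^{-1}(\mu_1-\mu_0)
- K + \ln\tfrac{\det\Sigma_1}{\det\Sigma_0}\Bigr]

With \Sigma_i = \operatorname{diag}(\sigma_i), the determinants reduce to the products torch.prod(sigma_1) and torch.prod(sigma_0) used in the code.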
def main():
    # Read in data
    data = readData("nba_stats.csv")

    # Randomize the data
    X = randomize(data)
    Y = X[:, -1]   # Only the last column (labels)
    X = X[:, :-1]  # All but the last column (features)
    D = len(X[0])

    # Standardize
    standardized = standardize(X)

    # Select first 2/3 for training
    index = int(math.ceil((2.0 / 3.0) * len(X)))
    training = standardized[:index + 1]
    testing = standardized[index + 1:]
    Y_testing = Y[index + 1:]

    # Divide training data into two groups
    positive = []
    negative = []
    for i in range(0, len(training)):
        if Y[i] == 1:  # positive class
            positive.append(training[i])
        else:
            negative.append(training[i])
    positive = numpy.array(positive).astype(float)
    negative = numpy.array(negative).astype(float)

    # Compute per-feature Gaussian models for the positive class
    positive_model = []
    for k in range(0, D):
        positive_model.append((numpy.mean(positive[:, k]), numpy.std(positive[:, k])))

    # Compute per-feature Gaussian models for the negative class
    negative_model = []
    for k in range(0, D):
        negative_model.append((numpy.mean(negative[:, k]), numpy.std(negative[:, k])))

    # Classify testing samples
    result = []
    testing_probabilities = []
    for sample in testing:
        # class priors: P(class) = count(class) / total
        p_positive = float(len(positive)) / (len(positive) + len(negative))
        p_negative = float(len(negative)) / (len(positive) + len(negative))
        for k in range(0, D):
            p_positive *= likelihood(positive_model[k][0], positive_model[k][1], sample[k])
            p_negative *= likelihood(negative_model[k][0], negative_model[k][1], sample[k])
        testing_probabilities.append(normalize_probabilities([p_positive, p_negative]))
        if p_positive > p_negative:
            result.append(1)
        else:
            result.append(0)

    # Sweep a decision threshold to build a precision-recall curve
    precisions = []
    recalls = []
    for threshold in range(0, 100, 5):
        threshold = float(threshold) / 100
        TruePositives = 0.0
        TrueNegatives = 0.0
        FalsePositives = 0.0
        FalseNegatives = 0.0
        for i in range(0, len(testing_probabilities)):
            if Y_testing[i] == 1:  # Positive example
                if testing_probabilities[i][0] > threshold:  # Predicted positive
                    TruePositives += 1
                else:  # Predicted negative
                    FalseNegatives += 1
            elif Y_testing[i] == 0:  # Negative example
                if testing_probabilities[i][0] > threshold:  # Predicted positive
                    FalsePositives += 1
                else:  # Predicted negative
                    TrueNegatives += 1
        try:
            precision = TruePositives / (TruePositives + FalsePositives)
        except ZeroDivisionError:
            precision = 1 if TruePositives == 0 else 0
        try:
            recall = TruePositives / (TruePositives + FalseNegatives)
        except ZeroDivisionError:
            recall = 1 if TruePositives == 0 else 0
        precisions.append(precision)
        recalls.append(recall)

    plt.plot(recalls, precisions, 'r-o')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.show()
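The likelihood and normalize_probabilities helpers are defined elsewhere in that project; a minimal sketch of the Gaussian likelihood helper that this per-feature Naive Bayes model assumes (the real implementation may differ, for example in how it guards against zero variance):

import math

def likelihood(mean, std, x):
    # Univariate Gaussian density N(x; mean, std^2).
    # A small floor on std avoids division by zero for constant features.
    std = max(std, 1e-9)
    coeff = 1.0 / (math.sqrt(2.0 * math.pi) * std)
    return coeff * math.exp(-((x - mean) ** 2) / (2.0 * std ** 2))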
# CS4375 Machine Learning Project
# arg1 trainingfile
# arg2 testfile
# arg3 attributefile (may be hardcoded instead)
import numpy as np
from sklearn.naive_bayes import GaussianNB
import util
import sys

# command-line arguments
args = sys.argv

# load training data
(trainingdata, trainingclasses) = util.readData(args[1])
trainingdata = np.array(trainingdata)
trainingclasses = np.array(trainingclasses)

# load test data (testclasses not used)
(testdata, testclasses) = util.readData(args[2])

classifier = GaussianNB()
classifier.fit(trainingdata, trainingclasses)

# note: accuracy is computed on the training data here, not on testdata
predictedVals = classifier.predict(trainingdata)
correct = 0
x = 0
for val in predictedVals:
    if val == trainingclasses[x]:
        correct += 1
    x += 1
print((correct / len(trainingdata)) * 100)

# attributes = util.readAttributes("data/attr.txt")
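util.readData is project-specific and not shown; a purely hypothetical sketch with the same return shape, assuming a CSV whose last column holds the class label (the project's actual file format may differ):

import csv

def readData(filename):
    # Hypothetical reader: numeric features in all but the last column,
    # class label in the last column.
    data, classes = [], []
    with open(filename) as f:
        for row in csv.reader(f):
            if not row:
                continue
            data.append([float(v) for v in row[:-1]])
            classes.append(row[-1])
    return data, classes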
with tf.Session() as sess:
    merged_summary_op = tf.summary.merge_all()
    writer = tf.summary.FileWriter("/home/cxr/tfboard/autocoderlstm", sess.graph)
    sess.run(tf.global_variables_initializer())
    start = global_step.eval()
    step = 0
    r = ReadData.Actionreader()
    ckpt_dir = "/home/cxr/BvhLstm1-2"
    filename = "/home/cxr/7-2"
    if Need_to_restore:
        if restore(ckpt_dir + "/"):
            print "restored successfully"
    if not Use_to_train:
        r.reset()
        v, timelist = utl.readData(filename)
        length = len(v)
        i = 0
        step = 0
        batch_xs, batch_ys = utl.get_batch(v, i, length, classnumber=classnum,
                                           batchsize=batch_size, n_sequence=n_sequence)
        print len(batch_xs)
        while batch_xs and step <= 2000:
            pre = sess.run([predic], feed_dict={x: batch_xs})
            r.out_data(pre[0], ckpt_dir)
            # batch_xs = batch_xs[0]
            # batch_xs = batch_xs[1:]
            # batch_xs.append(utl.transform(pre, classnum))
def dumpMultiFeatures(train_fn, test_fn, window, win_size_list, isALS, embedding_path):
    train_orig_prep_list, train_sent_list, train_correct_prep_list = readData(train_fn)
    test_orig_prep_list, test_sent_list, test_correct_prep_list = readData(test_fn)

    # binary decision
    prep_list = getPrepList()
    prep_num = len(prep_list)
    with open("model/binary_decision", "rb") as handle:
        binary_test_label = pickle.load(handle)
    total_valid_corrections = np.sum([
        int(test_orig_prep_list[ind] != test_correct_prep_list[ind])
        for ind in range(len(test_correct_prep_list))
    ])

    raw_train_features = getRawMultiFeatures(train_sent_list, win_size_list,
                                             isALS, embedding_path)
    raw_test_features = getRawMultiFeatures(test_sent_list, win_size_list,
                                            isALS, embedding_path)

    # select test examples to be corrected
    selected_test_inds = [ind for ind in range(len(binary_test_label))
                          if binary_test_label[ind] == 1]
    print "selected test instances:", len(selected_test_inds)
    test_prep_scores_win, test_side_prep_scores_win, test_prep_ranks, test_conf_mat = raw_test_features
    test_prep_scores_win = test_prep_scores_win[:, selected_test_inds, :]
    test_side_prep_scores_win = test_side_prep_scores_win[:, selected_test_inds, :]
    test_prep_ranks = [test_prep_ranks[ind] for ind in selected_test_inds]
    raw_test_features = (test_prep_scores_win, test_side_prep_scores_win,
                         test_prep_ranks, test_conf_mat)
    test_orig_prep_list = [test_orig_prep_list[ind] for ind in selected_test_inds]
    test_correct_prep_list = [test_correct_prep_list[ind] for ind in selected_test_inds]

    # preposition selection from multiple candidates
    print "using corpus bigram count as feature..."
    train_features, train_labels = multiSelectTrainFeatures(
        prep_list, raw_train_features, train_orig_prep_list, train_correct_prep_list)
    test_features, test_candidate_prep_inds = multiSelectTestFeatures(
        prep_list, raw_test_features, test_orig_prep_list)

    # dump features: train_features, train_labels, test_features, test_candidate_prep_inds
    model_folder = "model/"
    with open(model_folder + "train.feature_label_win=" + str(len(win_size_list)), "wb") as handle:
        pickle.dump((train_features, train_labels), handle)

    # dump test data
    test_sent_list = [test_sent_list[ind] for ind in selected_test_inds]
    with open(model_folder + "test.data_win=" + str(len(win_size_list)), "wb") as handle:
        pickle.dump((test_sent_list, test_orig_prep_list, test_correct_prep_list), handle)
    with open(model_folder + "test.feature_label_win=" + str(len(win_size_list)), "wb") as handle:
        pickle.dump((test_features, test_candidate_prep_inds), handle)
    print "done dumping train and test data..."
    return total_valid_corrections
loss = tf.reduce_mean(tf.pow(pred - label, 2))
train_op = tf.train.AdamOptimizer().minimize(loss)
global_step = tf.Variable(0, name='global_step', trainable=False)
saver = tf.train.Saver()
epochs = 20000

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    start = global_step.eval()
    ckpt_dir = "/home/cxr/BvhLstm1-2"
    filename = "/home/cxr/7-2"
    for epoch in range(epochs):
        print "training Epochs = ", epoch
        r = ReadData.Actionreader()
        v, _ = utl.readData(filename)
        length = len(v)
        i = 0
        step = 0
        batch_xs, batch_ys = utl.get_batch(i, v, sequenceLength, batch_size)
        while batch_xs is not None and batch_ys is not None:
            _, losss = sess.run(
                [train_op, loss],
                feed_dict={
                    encoder_inputs_raw: batch_xs,
                    decoder_targets_raw: batch_ys,
                    decoder_inputs_raw: batch_ys,
                })
            if step % 20 == 0:
        # print(sum((self.y - self.output) ** 2))
        # print(((self.y - self.output) ** 2))
        # print(sum(sum((self.y - self.output) ** 2)) / 4)
        # print(self.y)
        # print("####")
        # print(self.output)
        # print("%%%")
        # print(self.y, self.output)
        self.loss.append(sum((self.y - self.output) ** 2))


if __name__ == "__main__":
    # X the array of inputs, y the array of outputs, 4 pairs in total
    noTrainExamples, noFeatures, noOutputs, inTrainData, outTrainData = readData("slump_test.data")
    noTestExamples, noFeatures, noOutputs, inTestData, outTestData = readData("slump_test.data")
    normaliseData(noTrainExamples, noFeatures, inTrainData, noTestExamples, inTestData)
    normaliseData2(noTrainExamples, 1, outTrainData, noTestExamples, outTestData)
    # for i in range(len(inTrainData)):
    #     print(inTrainData[i])
    #     print(outTrainData[i])
    inTrainData = np.array(inTrainData)
    outTrainData = np.array(outTrainData)

    nn = NeuralNetwork(inTrainData, outTrainData)
    nn.loss = []
    iterations = []
    for i in range(10000):
def main():
    # Read in data
    data = readData("spambase.data")

    # Randomize the data
    X = randomize(data)
    Y = X[:, -1]   # Only the last column (labels)
    X = X[:, :-1]  # All but the last column (features)
    D = len(X[0])

    # Standardize
    standardized = standardize(X)

    # Select first 2/3 for training
    index = int(math.ceil((2.0 / 3.0) * len(X)))
    training = standardized[:index + 1]
    testing = standardized[index + 1:]
    Y_testing = Y[index + 1:]

    # Divide training data into two groups
    positive = []
    negative = []
    for i in range(0, len(training)):
        if Y[i] == 1:  # spam
            positive.append(training[i])
        else:
            negative.append(training[i])
    positive = numpy.array(positive).astype(float)
    negative = numpy.array(negative).astype(float)

    # Compute per-feature Gaussian models for spam
    positive_model = []
    for k in range(0, D):
        positive_model.append((numpy.mean(positive[:, k]), numpy.std(positive[:, k])))

    # Compute per-feature Gaussian models for non-spam
    negative_model = []
    for k in range(0, D):
        negative_model.append((numpy.mean(negative[:, k]), numpy.std(negative[:, k])))

    # Classify testing samples
    result = []
    for sample in testing:
        # class priors: P(class) = count(class) / total
        p_positive = float(len(positive)) / (len(positive) + len(negative))
        p_negative = float(len(negative)) / (len(positive) + len(negative))
        for k in range(0, D):
            p_positive *= likelihood(positive_model[k][0], positive_model[k][1], sample[k])
            p_negative *= likelihood(negative_model[k][0], negative_model[k][1], sample[k])
        if p_positive > p_negative:
            result.append(1)
        else:
            result.append(0)

    # Compute statistics
    TruePositives = 0.0
    TrueNegatives = 0.0
    FalsePositives = 0.0
    FalseNegatives = 0.0
    for i in range(0, len(result)):
        if Y_testing[i] == 1:  # Positive example
            if result[i] == 1:  # Predicted positive
                TruePositives += 1
            elif result[i] == 0:  # Predicted negative
                FalseNegatives += 1
        elif Y_testing[i] == 0:  # Negative example
            if result[i] == 1:  # Predicted positive
                FalsePositives += 1
            elif result[i] == 0:  # Predicted negative
                TrueNegatives += 1
    try:
        precision = TruePositives / (TruePositives + FalsePositives)
        recall = TruePositives / (TruePositives + FalseNegatives)
        f_measure = (2 * precision * recall) / (precision + recall)
        accuracy = (TruePositives + TrueNegatives) / (
            TruePositives + TrueNegatives + FalsePositives + FalseNegatives)
        print 'Precision: ' + str(precision)
        print 'Recall: ' + str(recall)
        print 'F-measure: ' + str(f_measure)
        print 'Accuracy: ' + str(accuracy)
    except ZeroDivisionError:
        pass
# MULTI_CLASSIFICATION: 5, 8
outputColumn = outputColumnVAR
inputColumns = inputColumnsVAR
networkType = NetworkType.REGRESSION
numberOfNeurons = 2 * numpy.size(inputColumns) + 1
dropout = 0.5  # TODO
numberOfLayers = 3
epochs = 150
validationSplit = 0.25
phaseStart = 0
phaseEnd = 4659

data = util.readData(trainDataFile)

#####################
# ----- START ----- #
#####################

# define input
input = data[:, inputColumnsVAR]
# util.plotChart('input[' + str(phaseStart) + ':' + str(phaseEnd) + ',0]', input[phaseStart:phaseEnd, 0])

# define output
output = data[:, outputColumn]  # extracts the output column
# util.plotChart('output', output[phaseStart:phaseEnd])

if networkType == NetworkType.REGRESSION:
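The excerpt ends just as the REGRESSION branch begins; a minimal sketch of what that branch might build, assuming Keras and the hyperparameters defined above (layer count, activations, and optimizer are assumptions, not the project's actual model):

from keras.models import Sequential
from keras.layers import Dense, Dropout

# Hypothetical regression network using the hyperparameters defined above.
model = Sequential()
model.add(Dense(numberOfNeurons, activation='relu', input_dim=input.shape[1]))
model.add(Dropout(dropout))
model.add(Dense(1, activation='linear'))  # single continuous output
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(input, output, epochs=epochs, validation_split=validationSplit)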