def _round_to_class(raw, max_class):
    """Round a raw network output to the nearest integer class label.

    Ties at .5 round up; the result is clamped to the range
    [0, max_class] (the original code's if/elif ladder behaved exactly
    this way).
    """
    return max(0, min(max_class, int(math.floor(raw + 0.5))))


def _cross_validated_error_rate(node_count, data_s, data_l, percentage, max_class):
    """Train a net with `node_count` hidden nodes; return its error rate.

    Performs the single train/test split the original inline code did
    (fold index 1, split size = len(data_s)), vectorises the held-out
    sentences with the vectorizer fitted on the training slice, and
    counts misclassifications after rounding raw outputs to class labels.

    NOTE(review): the original evaluated one candidate with max_class=2
    and the other with max_class=5 (and different `percentage` values),
    which makes the two error rates not strictly comparable; this is
    preserved here but looks unintended -- confirm.
    """
    print('start to create NN')
    print('Hidden nodes are %d' % node_count)
    [sliced_training_data, sliced_testing_data,
     sliced_training_label, sliced_testing_label] = \
        helper_cross_validation.spilteDataAndLabel(
            data_s, data_l, 1, percentage, step=len(data_s))
    [net, number_of_input_features, tfidf_vectorizer] = \
        creatingNeuralNetwork(node_count, sliced_training_data, sliced_training_label)
    # Vectorise each held-out sentence into one row of the input matrix.
    indata = np.zeros((len(sliced_testing_data), number_of_input_features))
    for j, sentence in enumerate(sliced_testing_data):
        indata[j, :] = tfidf_vectorizer.transform([sentence]).toarray()[0]
    results = net(indata)
    predictions = [_round_to_class(r, max_class) for r in results]
    total_num = len(predictions)
    total_error = 0
    for prediction, label in zip(predictions, sliced_testing_label):
        if prediction != label:
            total_error = total_error + 1
    return float(total_error) / float(total_num)


def annealingSimulation(data_s, data_l, max_nodes, min_nodes, temperature=10000.0, cool=0.15, step=10):
    """Search for a good hidden-node count via simulated annealing.

    Parameters:
        data_s, data_l: training sentences and their labels.
        max_nodes, min_nodes: inclusive bounds on the hidden-node count
            (min_nodes is clamped to 0 if negative).
        temperature: starting temperature of the annealing schedule.
        cool: multiplicative cooling factor applied each iteration.
        step: maximum size of a random move in node count per iteration.

    Returns the best node count found when the temperature drops to 0.1.
    """
    # generate a random number to start
    if min_nodes < 0:
        min_nodes = 0
    nodeA = random.randint(min_nodes, max_nodes)
    while temperature > 0.1:
        # Propose a neighbouring node count, clamped to the valid range.
        nodeB = nodeA + random.randint(-step, step)
        nodeB = max(min_nodes, min(max_nodes, nodeB))
        # Cross-validate both candidates (percentage/max_class values kept
        # from the original code -- see NOTE in the helper).
        error_rateA = _cross_validated_error_rate(nodeA, data_s, data_l, 0.05, 2)
        error_rateB = _cross_validated_error_rate(nodeB, data_s, data_l, 0.02, 5)
        # Metropolis acceptance: always take an improvement, otherwise
        # accept with probability exp(-(errB - errA) / T).
        # BUG FIX: the original computed exp((-errB - errA)/T), i.e. the
        # probability depended on the SUM of the error rates rather than
        # on how much worse the candidate is.
        p = math.exp((error_rateA - error_rateB) / temperature)
        if error_rateB < error_rateA or random.random() < p:
            nodeA = nodeB
            print("Node number: %d, accuracy: %f" % (nodeB, 1 - error_rateB))
        else:
            print("Node number: %d, accuracy: %f" % (nodeA, 1 - error_rateA))
        # cooling a little bit (geometric schedule: T <- T * cool)
        temperature = temperature * cool
    return nodeA
total_testing_times = 0 error_testing_times = 0 size_training_set = len(training_set) total_num_loop = int(size_training_set / step) x = 0 # print ("There are %d loops" % (total_num_loop)) while x < total_num_loop: x = x + 1 # print ("This is %d loop" % (x)) # start to slice array and train classifer [ sliced_training_data, sliced_testing_data, sliced_training_label, sliced_testing_label, ] = helper_cross_validation.spilteDataAndLabel(training_set, label_set, x, percentage, step=step) training_feature = tfidf.fit_transform(sliced_training_data) classifier.fit(training_feature, sliced_training_label) # start to testing indexOfTest = 0 testing_feature = tfidf.transform(sliced_testing_data) test_result = classifier.predict(testing_feature) while indexOfTest < len(sliced_testing_data): total_testing_times = total_testing_times + 1 if test_result[indexOfTest] != sliced_testing_label[indexOfTest]: error_testing_times = error_testing_times + 1 indexOfTest = indexOfTest + 1 successRatio = 1 - float(error_testing_times) / float(total_testing_times) print ("Success Ratio is %f" % (successRatio))
while n<max_ngram: n=n+1 print ("Max ngram is %d" % (n)) classifier = MultinomialNB() tfidf = TfidfVectorizer(sublinear_tf=True, max_df=0.9, stop_words=None, token_pattern=pattern, ngram_range=(1, n)) total_testing_times = 0; error_testing_times = 0; size_training_set = len(training_set) total_num_loop = int(size_training_set/step) x = 0 # print ("There are %d loops" % (total_num_loop)) while x<total_num_loop: x = x+1 # print ("This is %d loop" % (x)) #start to slice array and train classifer [sliced_training_data,sliced_testing_data,sliced_training_label,sliced_testing_label]=helper_cross_validation.spilteDataAndLabel( training_set,label_set,x,percentage,step=step) training_feature = tfidf.fit_transform(sliced_training_data) classifier.fit(training_feature, sliced_training_label) #start to testing indexOfTest = 0 testing_feature = tfidf.transform(sliced_testing_data) test_result = classifier.predict(testing_feature) while indexOfTest < len(sliced_testing_data): total_testing_times = total_testing_times+1 if test_result[indexOfTest]!=sliced_testing_label[indexOfTest]: error_testing_times = error_testing_times+1 indexOfTest=indexOfTest+1 successRatio =1-float(error_testing_times)/float(total_testing_times) print ("Success Ratio is %f" % (successRatio))