def extract_feature_by_kmeans(class_num, subsample_size, window_size, cluster_num, max_iter, rnd_number):
	# Learn a feature vocabulary from the training split via k-means and
	# write the normalized centroids to a feature file.
	file_name = gen_feature_fname(class_num, subsample_size, window_size, cluster_num)
	start = time.time()
	train_X, test_X, train_y, test_y = FetchFile.gen_data(class_num, subsample_size, window_size, rnd_number)
	print("Generate Data Time: ", time.time() - start)
	features = learnvocabulary(train_X, cluster_num, max_iter)
	features = mynormalize_multi(features)
	write_features(file_name, features)
	print("=== Feature Extraction Finished ===")
def grid_search_for_neighbor_multiprocess(class_num, subsample_size, window_size, cluster_num, max_iter, rnd_number, neighbor_num_seq):
	# Run Classification.classifiy once per neighbor count, each in its own process.
	train_X, test_X, train_y, test_y = FetchFile.gen_data(class_num, subsample_size, window_size, rnd_number)
	jobs = []
	# neighbor_num_seq is typically a power-of-two sweep, e.g. [2**i for i in range(neighbor_log2_num)]
	for neighbor_num in neighbor_num_seq:
		p = multiprocessing.Process(target=Classification.classifiy, args=(class_num, subsample_size, window_size, cluster_num, \
			max_iter, rnd_number, neighbor_num, train_X, train_y, test_X, test_y))
		jobs.append(p)
		p.start()
	# Wait for every classification process to finish before returning.
	for p in jobs:
		p.join()
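A possible alternative sketch using multiprocessing.Pool, which bounds how many classifications run at once; illustrative only, reusing the classifiy arguments exactly as called above (the function name and the workers parameter are hypothetical).

# Hypothetical Pool-based variant; caps the number of concurrent workers.
def grid_search_for_neighbor_pool(class_num, subsample_size, window_size, cluster_num,
		max_iter, rnd_number, neighbor_num_seq, workers=4):
	train_X, test_X, train_y, test_y = FetchFile.gen_data(class_num, subsample_size, window_size, rnd_number)
	with multiprocessing.Pool(workers) as pool:
		pool.starmap(Classification.classifiy,
			[(class_num, subsample_size, window_size, cluster_num, max_iter,
			  rnd_number, n, train_X, train_y, test_X, test_y) for n in neighbor_num_seq])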
Example #3
# Header and loop reconstructed from the call site below (got_feature(...) returns
# a list indexed by cluster size); the lines above this return were truncated.
def got_feature(class_num, subsample_size, window_size, cluster_num):
    centroid = []
    for num in cluster_num:
        # Assumed: feature files follow the same gen_feature_fname naming scheme.
        centroid_file = gen_feature_fname(class_num, subsample_size, window_size, num)
        centroid.append(get_feature.read_features(centroid_file))
    return centroid


if __name__ == "__main__":

    rnd_number    = 8131985
    class_num = __CLASS_NUM__
    subsample_size = 92
    window_size = __WINDOW_SIZE__
    cluster_num = [__CLUSTER_NUM__]

    ###################################################
    # Permute the data and split it randomly into     #
    # equal train and test halves.                    #
    train_X, test_X, train_y, test_y = FetchFile.gen_file(class_num, subsample_size, window_size, rnd_number)
    ## USAGE: gen_file(class_num, subsample_size, window_size, rnd_number) ##
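    # A minimal sketch (as comments) of the half/half split gen_file is assumed
    # to perform; numpy-based and illustrative only, not the actual FetchFile code:
    #     rng = numpy.random.RandomState(rnd_number)
    #     idx = rng.permutation(len(X))
    #     half = len(idx) // 2
    #     train_X, test_X = X[idx[:half]], X[idx[half:]]
    #     train_y, test_y = y[idx[:half]], y[idx[half:]]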


    print(train_y)
    print(test_y)

    ###################################################
    # Load the feature vocabulary (k-means centroids) learned from the input data.

    centroid = got_feature(class_num, subsample_size, window_size, cluster_num)

    print(centroid[0])
    print(len(centroid[0]))

###################################################
def grid_search_for_neighbor(class_num, subsample_size, window_size, cluster_num, max_iter, rnd_number, neighbor_num_seq):
	# Sequential variant of the grid search: run one classification per neighbor count.
	train_X, test_X, train_y, test_y = FetchFile.gen_data(class_num, subsample_size, window_size, rnd_number)
	# neighbor_num_seq is typically a power-of-two sweep, e.g. [2**i for i in range(neighbor_log2_num)]
	for neighbor_num in neighbor_num_seq:
		Classification.classifiy(class_num, subsample_size, window_size, cluster_num, max_iter, rnd_number, neighbor_num, train_X, train_y, test_X, test_y)
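A hedged usage example of the sequential grid search; subsample_size and rnd_number match the constants above, all other values are illustrative.

# Illustrative call; class_num, window_size, cluster_num and max_iter are hypothetical.
grid_search_for_neighbor(class_num=10, subsample_size=92, window_size=16,
	cluster_num=64, max_iter=300, rnd_number=8131985,
	neighbor_num_seq=[2**i for i in range(5)])  # sweeps k = 1, 2, 4, 8, 16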
Example #5
# Build an MMDD date string for today's analysis target.
date_str = str(cur_date.month).zfill(2) + str(cur_date.day).zfill(2)

# Allow overriding the target date from the command line, e.g. --debugdate:0131
if (len(sys.argv) == 2) and (sys.argv[1].startswith('--debugdate:')):
    date_str = sys.argv[1][12:]

print('Analysis target date = ', date_str)

# Download the config file for the current date.
config_file_name = download_path[mode] + date_str + conf_file_suff
config_file_url = fixed_conf_url + date_str + conf_file_suff
check_file_name = download_path[mode] + date_str + check_file_suff
# Check for the marker file first to avoid downloading the same file twice.
if os.path.isfile(check_file_name):
    print('No contest today.')
    exit()
FF.download_url_to_file(config_file_url, config_file_name)
# Validate the config file; an invalid file means there is no contest, so exit.
# An invalid config file needs no transcoding: it is already UTF-8.
if AD.legal_config_file(config_file_name) == no_contest:
    with open(debug_file_name[mode], 'w') as debug_info_file:
        debug_info_file.write('No contest')
    with open(check_file_name, 'w') as chk_file:
        chk_file.write('No contest')
    print('No contest')
    exit()
FF.file_convert_pagecode(config_file_name, source_pagecode, config_file_name, dest_pagecode)

AD.load_config_file(config_file_name)
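
FF's helpers are not defined in this snippet. The sketches below show one stdlib-only way they might behave; the names and argument order mirror the calls above, but the bodies are assumptions, not the original implementations.

# Hypothetical sketches of the FF helpers used above; not the original code.
import codecs
import urllib.request

def download_url_to_file(url, file_name):
    # Fetch the URL and write the raw bytes to file_name.
    with urllib.request.urlopen(url) as resp, open(file_name, 'wb') as f:
        f.write(resp.read())

def file_convert_pagecode(src_name, src_encoding, dst_name, dst_encoding):
    # Re-encode a text file from src_encoding (e.g. 'gb2312') to dst_encoding
    # (e.g. 'utf-8'); reading fully first lets src and dst be the same file.
    with codecs.open(src_name, 'r', src_encoding) as src:
        text = src.read()
    with codecs.open(dst_name, 'w', dst_encoding) as dst:
        dst.write(text)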