def data_sample(read_directory, write_directory1, write_directory2):
    '''
    Multiply every batch matrix by a freshly drawn random Gaussian matrix.

    Batches are read from ``<read_directory>/<n>.txt`` for n = 1..N, where N
    is the number of files found under ``read_directory`` (counted
    recursively with ``os.walk``). Each batch is loaded with
    ``get_text_to_nparray(path, 'int')``, transposed, and left-multiplied by
    a ``sample_size x data_dimension`` matrix whose entries are drawn with
    ``random.gauss(1, sqrt(1 / sqrt(sample_size)))``; ``sample_size`` is
    fixed at 250 and ``data_dimension`` is the row count of the transposed
    batch.

    :param read_directory: directory holding the numbered batch files.
    :param write_directory1: directory receiving one ``<n>.txt`` per batch;
        each row of the product becomes one space-separated line.
    :param write_directory2: directory receiving ``sample_time.txt`` (the
        timer reading per batch) and ``ratio.txt`` (per batch:
        ``sample_size / data_dimension * 8.0 / 4.0``).
    '''
    # NOTE(review): this file defines data_sample a second time right after
    # this definition; Python binds the later one, so one copy should go.

    # Built-in sum keeps the count a plain int. np.sum([]) returns the float
    # 0.0 when os.walk yields nothing, which range() then rejects.
    file_number = sum(len(files) for _, _, files in os.walk(read_directory))

    sample_size = 250
    # Standard deviation of the matrix entries; it does not depend on the
    # batch, so it is computed once instead of once per element.
    sigma = np.sqrt(np.true_divide(1, np.sqrt(sample_size)))
    # time.clock() was removed in Python 3.8; it is only used where
    # perf_counter() does not exist (Python 2).
    timer = getattr(time, 'perf_counter', None) or time.clock

    sample_time = []
    ratio = []
    for batch in range(1, file_number + 1):
        batch_file = str(batch) + '.txt'
        vsm_matrix = get_text_to_nparray(
            read_directory + '/' + batch_file, 'int').T
        print('Batch: %d' % batch)

        start = timer()
        data_dimension = vsm_matrix.shape[0]
        Q = np.zeros((sample_size, data_dimension))
        # Filled row by row, left to right, so random.gauss is consumed in
        # the same order as an element-wise Q[k, j] loop.
        for row in Q:
            for j in range(data_dimension):
                row[j] = random.gauss(1, sigma)
        sample_result = np.dot(Q, vsm_matrix)

        # NOTE(review): 8.0 / 4.0 presumably compares 8-byte floats with
        # 4-byte ints -- confirm with the author.
        this_ratio = np.true_divide(sample_size, data_dimension) * 8.0 / 4.0
        ratio.append(str(this_ratio))

        interval = timer() - start
        print('Time: %f' % interval)
        sample_time.append(str(interval))

        write_result = [" ".join([str(x) for x in row])
                        for row in sample_result]
        quick_write_list_to_text(write_result,
                                 write_directory1 + '/' + batch_file)

    # The per-batch logs are written once, after all batches are processed.
    quick_write_list_to_text(sample_time,
                             write_directory2 + '/sample_time.txt')
    quick_write_list_to_text(ratio, write_directory2 + '/ratio.txt')
def data_sample(read_directory, write_directory1, write_directory2):
    '''
    Multiply every batch matrix by a freshly drawn random Gaussian matrix.

    Batches are read from ``<read_directory>/<n>.txt`` for n = 1..N, where N
    is the number of files found under ``read_directory`` (counted
    recursively with ``os.walk``). Each batch is loaded with
    ``get_text_to_nparray(path, 'int')``, transposed, and left-multiplied by
    a ``sample_size x data_dimension`` matrix whose entries are drawn with
    ``random.gauss(1, sqrt(1 / sqrt(sample_size)))``; ``sample_size`` is
    fixed at 250 and ``data_dimension`` is the row count of the transposed
    batch.

    :param read_directory: directory holding the numbered batch files.
    :param write_directory1: directory receiving one ``<n>.txt`` per batch;
        each row of the product becomes one space-separated line.
    :param write_directory2: directory receiving ``sample_time.txt`` (the
        timer reading per batch) and ``ratio.txt`` (per batch:
        ``sample_size / data_dimension * 8.0 / 4.0``).
    '''
    # NOTE(review): this re-defines data_sample, which is already defined
    # immediately above in this file; the earlier copy is shadowed by this
    # one and should be removed.

    # Built-in sum keeps the count a plain int. np.sum([]) returns the float
    # 0.0 when os.walk yields nothing, which range() then rejects.
    file_number = sum(len(files) for _, _, files in os.walk(read_directory))

    sample_size = 250
    # Standard deviation of the matrix entries; it does not depend on the
    # batch, so it is computed once instead of once per element.
    sigma = np.sqrt(np.true_divide(1, np.sqrt(sample_size)))
    # time.clock() was removed in Python 3.8; it is only used where
    # perf_counter() does not exist (Python 2).
    timer = getattr(time, 'perf_counter', None) or time.clock

    sample_time = []
    ratio = []
    for batch in range(1, file_number + 1):
        batch_file = str(batch) + '.txt'
        vsm_matrix = get_text_to_nparray(
            read_directory + '/' + batch_file, 'int').T
        print('Batch: %d' % batch)

        start = timer()
        data_dimension = vsm_matrix.shape[0]
        Q = np.zeros((sample_size, data_dimension))
        # Filled row by row, left to right, so random.gauss is consumed in
        # the same order as an element-wise Q[k, j] loop.
        for row in Q:
            for j in range(data_dimension):
                row[j] = random.gauss(1, sigma)
        sample_result = np.dot(Q, vsm_matrix)

        # NOTE(review): 8.0 / 4.0 presumably compares 8-byte floats with
        # 4-byte ints -- confirm with the author.
        this_ratio = np.true_divide(sample_size, data_dimension) * 8.0 / 4.0
        ratio.append(str(this_ratio))

        interval = timer() - start
        print('Time: %f' % interval)
        sample_time.append(str(interval))

        write_result = [" ".join([str(x) for x in row])
                        for row in sample_result]
        quick_write_list_to_text(write_result,
                                 write_directory1 + '/' + batch_file)

    # The per-batch logs are written once, after all batches are processed.
    quick_write_list_to_text(sample_time,
                             write_directory2 + '/sample_time.txt')
    quick_write_list_to_text(ratio, write_directory2 + '/ratio.txt')
def merge_batch(read_directory1, read_directory2, read_directory3,
                read_directory4, read_filename, write_directory1,
                write_directory2):
    '''
    Merge groups of batches into one term-frequency matrix per group.

    Each line of ``read_filename`` holds the whitespace-separated names of
    the batches that make up one group; groups are numbered by 1-based line
    number. For group ``g`` the target word list is read from
    ``<read_directory2>/<g>.txt`` (one stripped line per word). Every row of
    every member batch is re-expressed over that word list: a word keeps the
    row's value when that value is greater than 0.0001, otherwise it gets 0.

    :param read_directory1: directory with the per-batch matrices
        ``<batch>.txt``, loaded via ``get_text_to_nparray(path, 'int')``.
    :param read_directory2: directory with the per-group word lists
        ``<g>.txt``.
    :param read_directory3: directory with the per-batch word lists
        ``<batch>.txt``; only the first whitespace-separated token of each
        line is used, and its line position is the matrix column index.
    :param read_directory4: directory with the per-batch id/time files
        ``<batch>.txt``, loaded via ``get_text_to_single_list``.
    :param read_filename: text file listing the batches of each group.
    :param write_directory1: directory receiving the merged matrix of each
        group as ``<g>.txt``, one space-separated row per line.
    :param write_directory2: directory receiving the concatenated id/time
        entries of each group as ``<g>.txt``.
    '''
    # NOTE(review): this file defines merge_batch a second time right after
    # this definition; Python binds the later one, so one copy should go.

    with open(read_filename) as index_file:
        all_batch_index = [line.split() for line in index_file]

    for group_number, batch_names in enumerate(all_batch_index, 1):
        group_file = str(group_number) + '.txt'

        with open(read_directory2 + '/' + group_file, 'rb') as f1:
            this_word_list = [line.strip() for line in f1]

        result = []
        result_id_time = []
        for batch_name in batch_names:
            batch_file = batch_name + '.txt'

            with open(read_directory3 + '/' + batch_file, 'rb') as f2:
                word_list = [line.split()[0] for line in f2]

            vsm_nparray = get_text_to_nparray(
                read_directory1 + '/' + batch_file, 'int')

            # get_text_to_single_list presumably fills id_time in place --
            # its return value is not used here.
            id_time = []
            get_text_to_single_list(
                id_time, read_directory4 + '/' + batch_file)
            result_id_time.extend(id_time)

            for row in vsm_nparray:
                # Word -> value for the entries of this row above the
                # threshold. Indexing word_list (rather than zipping) keeps
                # a column/word-list length mismatch loud.
                tf_dict = {}
                for k, tf in enumerate(row):
                    if tf > 0.0001:
                        tf_dict[word_list[k]] = tf

                # dict.get replaces the `in tf_dict.keys()` test, which is a
                # linear list scan per word on Python 2.
                this_line = [str(tf_dict.get(word, 0))
                             for word in this_word_list]
                # Join each row into a single string so it is easy to write.
                result.append(" ".join(this_line))

        quick_write_list_to_text(result, write_directory1 + '/' + group_file)
        quick_write_list_to_text(result_id_time,
                                 write_directory2 + '/' + group_file)
def merge_batch(read_directory1, read_directory2, read_directory3,
                read_directory4, read_filename, write_directory1,
                write_directory2):
    '''
    Merge groups of batches into one term-frequency matrix per group.

    Each line of ``read_filename`` holds the whitespace-separated names of
    the batches that make up one group; groups are numbered by 1-based line
    number. For group ``g`` the target word list is read from
    ``<read_directory2>/<g>.txt`` (one stripped line per word). Every row of
    every member batch is re-expressed over that word list: a word keeps the
    row's value when that value is greater than 0.0001, otherwise it gets 0.

    :param read_directory1: directory with the per-batch matrices
        ``<batch>.txt``, loaded via ``get_text_to_nparray(path, 'int')``.
    :param read_directory2: directory with the per-group word lists
        ``<g>.txt``.
    :param read_directory3: directory with the per-batch word lists
        ``<batch>.txt``; only the first whitespace-separated token of each
        line is used, and its line position is the matrix column index.
    :param read_directory4: directory with the per-batch id/time files
        ``<batch>.txt``, loaded via ``get_text_to_single_list``.
    :param read_filename: text file listing the batches of each group.
    :param write_directory1: directory receiving the merged matrix of each
        group as ``<g>.txt``, one space-separated row per line.
    :param write_directory2: directory receiving the concatenated id/time
        entries of each group as ``<g>.txt``.
    '''
    # NOTE(review): this re-defines merge_batch, which is already defined
    # immediately above in this file; the earlier copy is shadowed by this
    # one and should be removed.

    with open(read_filename) as index_file:
        all_batch_index = [line.split() for line in index_file]

    for group_number, batch_names in enumerate(all_batch_index, 1):
        group_file = str(group_number) + '.txt'

        with open(read_directory2 + '/' + group_file, 'rb') as f1:
            this_word_list = [line.strip() for line in f1]

        result = []
        result_id_time = []
        for batch_name in batch_names:
            batch_file = batch_name + '.txt'

            with open(read_directory3 + '/' + batch_file, 'rb') as f2:
                word_list = [line.split()[0] for line in f2]

            vsm_nparray = get_text_to_nparray(
                read_directory1 + '/' + batch_file, 'int')

            # get_text_to_single_list presumably fills id_time in place --
            # its return value is not used here.
            id_time = []
            get_text_to_single_list(
                id_time, read_directory4 + '/' + batch_file)
            result_id_time.extend(id_time)

            for row in vsm_nparray:
                # Word -> value for the entries of this row above the
                # threshold. Indexing word_list (rather than zipping) keeps
                # a column/word-list length mismatch loud.
                tf_dict = {}
                for k, tf in enumerate(row):
                    if tf > 0.0001:
                        tf_dict[word_list[k]] = tf

                # dict.get replaces the `in tf_dict.keys()` test, which is a
                # linear list scan per word on Python 2.
                this_line = [str(tf_dict.get(word, 0))
                             for word in this_word_list]
                # Join each row into a single string so it is easy to write.
                result.append(" ".join(this_line))

        quick_write_list_to_text(result, write_directory1 + '/' + group_file)
        quick_write_list_to_text(result_id_time,
                                 write_directory2 + '/' + group_file)