def map_word_list(read_directory1, read_directory2, write_filename):
    """Map every batch's VSM matrix back to word strings and write them out.

    Assumes the files in both directories are named 1.txt .. N.txt, pairing
    the VSM matrix in ``read_directory1`` with the word list (one word per
    line) in ``read_directory2`` -- TODO confirm directory layout.

    :param read_directory1: directory of whitespace-delimited VSM matrices.
    :param read_directory2: directory of per-batch word-list files.
    :param write_filename: output file; one space-joined word line per
        VSM column, all batches concatenated.
    """
    # Total number of files under read_directory1.
    file_number = np.sum(
        [len(files) for _, _, files in os.walk(read_directory1)])
    result = []
    for i in range(file_number):
        # Context manager closes the file even on error (original leaked the
        # handle on exceptions); iterate the file directly instead of a
        # manual readline loop.
        with open(read_directory2 + '/' + str(i + 1) + '.txt') as f:
            word_list = [line.strip() for line in f]
        # Transpose so each iterated row is one column vector of the VSM.
        vsm = np.loadtxt(read_directory1 + '/' + str(i + 1) + '.txt').T
        for row in vsm:
            result.append(" ".join(reflect_vsm_to_wordlist(row, word_list)))
    quick_write_list_to_text(result, write_filename)
def get_final_center(read_filename1, read_filename2, write_filename):
    """Translate the final center matrix into word lines and write them.

    :param read_filename1: file holding the center matrix (numpy text format).
    :param read_filename2: file holding the word list, loaded via
        get_text_to_single_list.
    :param write_filename: output file; one space-joined word line per
        center (i.e. per column of the matrix).
    """
    word_list = []
    get_text_to_single_list(word_list, read_filename2)
    # Transpose so iteration walks the matrix column by column.
    centers = np.loadtxt(read_filename1).T
    lines = [" ".join(reflect_vsm_to_wordlist(row, word_list))
             for row in centers]
    quick_write_list_to_text(lines, write_filename)
def map_word_list(read_filename1, read_filename2, write_filename):
    """Map a single VSM matrix back to word strings and write them out.

    NOTE(review): this module defines ``map_word_list`` more than once; the
    last definition wins at import time -- consider renaming.

    :param read_filename1: file holding the VSM matrix (numpy text format).
    :param read_filename2: file whose lines are comma-separated records; the
        first field of each line is the word.
    :param write_filename: output file; one space-joined word line per
        matrix column.
    """
    # Text mode: the original opened with 'rb', but ``bytes.split(',')``
    # raises TypeError on Python 3 (bytes vs str separator).
    with open(read_filename2) as f:
        word_list = [line.strip().split(',')[0] for line in f]
    # Transpose so each row iterated is one column vector.
    vsm = np.loadtxt(read_filename1).T
    word_result = [" ".join(reflect_vsm_to_wordlist(row, word_list))
                   for row in vsm]
    quick_write_list_to_text(word_result, write_filename)
def sample_real_center(read_filename1, read_filename2, write_filename,
                       top_n=1000):
    """Map center vectors onto the top-N words and write the word lines.

    :param read_filename1: file holding the center matrix (numpy text format).
    :param read_filename2: file whose lines start with a word (first
        whitespace-separated token).
    :param write_filename: output file; one space-joined word line per
        matrix column.
    :param top_n: how many leading words to keep from the word file
        (generalizes the previously hard-coded 1000; default preserves the
        old behavior).
    """
    # Context manager replaces the original leak-prone open/readline/close.
    with open(read_filename2) as f:
        word_list = [line.strip().split()[0] for line in f]
    # Keep only the first top_n words -- presumably the file is sorted by
    # importance; TODO confirm against the producer of read_filename2.
    word_list = word_list[:top_n]
    vsm = np.loadtxt(read_filename1).T
    result = [" ".join(reflect_vsm_to_wordlist(row, word_list))
              for row in vsm]
    quick_write_list_to_text(result, write_filename)
def map_word_list(read_directory1, read_filename2, write_filename):
    """Map every batch's VSM matrix to words from one shared word list.

    NOTE(review): third definition of ``map_word_list`` in this module; the
    last one shadows the others -- consider renaming.

    :param read_directory1: directory of VSM matrices named 1.txt .. N.txt.
    :param read_filename2: single shared word-list file, one word per line.
    :param write_filename: output file; one space-joined word line per
        matrix column, all batches concatenated.
    """
    # Total number of files under read_directory1.
    file_number = np.sum(
        [len(files) for _, _, files in os.walk(read_directory1)])
    # Text mode: the original's 'rb' produced bytes words, which break the
    # str " ".join downstream on Python 3.
    with open(read_filename2) as f:
        word_list = [line.strip() for line in f]
    result = []
    for i in range(file_number):
        # Transpose so each iterated row is one column vector of the VSM.
        vsm = np.loadtxt(read_directory1 + '/' + str(i + 1) + '.txt').T
        for row in vsm:
            result.append(" ".join(reflect_vsm_to_wordlist(row, word_list)))
    quick_write_list_to_text(result, write_filename)
def merge_all_center(read_directory1, read_directory2, start_batch, end_batch):
    """Merge per-batch cluster centers onto one shared vocabulary.

    Pass 1 builds the union vocabulary of all words appearing in any batch
    center; pass 2 re-expresses every center as a dense vector over that
    vocabulary.

    :param read_directory1: directory of center matrices, one file per batch.
    :param read_directory2: directory of word-list files (first token per
        line is the word), one file per batch.
    :param start_batch: first batch number (inclusive).
    :param end_batch: last batch number (exclusive, as in range()).
    :returns: (result, new_word_list, all_count) where ``result`` is a list
        of per-batch lists of numpy vectors over ``new_word_list``, and
        ``all_count`` is the total number of center vectors processed.
    """
    def _read_word_list(batch):
        # Text mode: the original 'rb' yielded bytes words, which do not mix
        # with str processing on Python 3.
        with open(read_directory2 + '/' + str(batch) + '.txt') as f:
            return [line.split()[0] for line in f]

    def _load_centers(batch):
        center = np.loadtxt(read_directory1 + '/' + str(batch) + '.txt')
        # np.loadtxt returns a 1-D array for a single-row file; promote it so
        # iteration yields whole vectors.  The original tested
        # ``len(center) >= 200``, which misclassifies any 2-D matrix with
        # >= 200 rows; ndim is the robust check.
        if center.ndim == 1:
            center = center[np.newaxis, :]
        return center

    # Pass 1: union vocabulary in first-seen order.  A seen-set makes the
    # membership test O(1) (the original re-scanned the list each time) and
    # the resulting order deterministic (the original depended on set
    # iteration order).
    new_word_list = []
    seen = set()
    for batch in range(start_batch, end_batch):
        word_list = _read_word_list(batch)
        for vec in _load_centers(batch):
            for word in reflect_vsm_to_wordlist(vec, word_list):
                if word not in seen:
                    seen.add(word)
                    new_word_list.append(word)

    # Pass 2: project every center onto the merged vocabulary.
    result = []
    all_count = 0
    for batch in range(start_batch, end_batch):
        word_list = _read_word_list(batch)
        this_result = []
        for vec in _load_centers(batch):
            # Weight of each word that survives the noise threshold.
            tf_dict = {word_list[k]: vec[k]
                       for k in range(len(vec)) if vec[k] > 0.000001}
            this_line = np.zeros(len(new_word_list))
            for pos, key in enumerate(new_word_list):
                this_line[pos] = tf_dict.get(key, 0)
            this_result.append(this_line)
            all_count += 1
        result.append(this_result)
    return result, new_word_list, all_count
def merge_all_center(read_directory1, read_directory2, read_filename,
                     write_directory, write_filename):
    """Merge all batch centers onto one shared vocabulary and write results.

    Pass 1 builds the union vocabulary of all words that appear in any batch
    center; pass 2 rewrites each batch's centers as rows over that vocabulary
    and writes one output file per batch, plus the merged word list.

    :param read_directory1: directory of center matrices named 1.txt .. N.txt.
    :param read_directory2: directory of word-list files, selected per batch
        by the index read from ``read_filename``.
    :param read_filename: file whose line-leading tokens name the word-list
        file for each batch, in order.
    :param write_directory: output directory; one file per batch with the
        re-projected centers (space-joined numbers, one center per line).
    :param write_filename: output file for the merged vocabulary.
    """
    # Total number of batch files under read_directory1.
    file_number = np.sum(
        [len(files) for _, _, files in os.walk(read_directory1)])
    with open(read_filename) as f:
        batch_index = [line.split()[0] for line in f]

    def _read_word_list(i):
        # Text mode: the original 'rb' produced bytes words that break str
        # joins under Python 3.
        with open(read_directory2 + '/' + batch_index[i] + '.txt') as f1:
            return [line.split()[0] for line in f1]

    # Pass 1: merged vocabulary, first-seen order (seen-set keeps the
    # membership test O(1); the original re-scanned the list every time).
    new_word_list = []
    seen = set()
    for i in range(file_number):
        word_list = _read_word_list(i)
        # Transpose so each row iterated is one center vector.
        center = np.loadtxt(read_directory1 + '/' + str(i + 1) + '.txt').T
        for vec in center:
            for word in reflect_vsm_to_wordlist(vec, word_list):
                if word not in seen:
                    seen.add(word)
                    new_word_list.append(word)

    # Pass 2: rewrite each batch's centers over the merged vocabulary.
    for i in range(file_number):
        word_list = _read_word_list(i)
        center = np.loadtxt(read_directory1 + '/' + str(i + 1) + '.txt').T
        result = []
        for vec in center:
            # Keep only weights above the noise threshold; absent words
            # contribute 0 (matching the original's tf_dict2 fill-in).
            tf_dict = {word_list[k]: vec[k]
                       for k in range(len(vec)) if vec[k] > 0.0001}
            # Join each center into one string line for writing.
            result.append(" ".join(str(tf_dict.get(key, 0))
                                   for key in new_word_list))
        quick_write_list_to_text(result,
                                 write_directory + '/' + str(i + 1) + '.txt')
    quick_write_list_to_text(new_word_list, write_filename)
def merge_all_center(read_directory1, read_directory2, read_filename,
                     write_directory, write_filename):
    """Merge all batch centers onto one shared vocabulary and write results.

    NOTE(review): this is a byte-for-byte duplicate of an earlier
    ``merge_all_center`` definition in this module; the later definition
    shadows the earlier one at import time -- consider deleting one copy.

    :param read_directory1: directory of center matrices named 1.txt .. N.txt.
    :param read_directory2: directory of word-list files, selected per batch
        via the index file.
    :param read_filename: index file; the first token of each line names the
        word-list file for the corresponding batch.
    :param write_directory: per-batch output files of re-projected centers.
    :param write_filename: output file for the merged vocabulary.
    """
    # Number of batch files to process.
    file_number = np.sum(
        [len(files) for _, _, files in os.walk(read_directory1)])
    with open(read_filename) as idx:
        batch_index = [row.split()[0] for row in idx]

    def _words_for(i):
        # Text mode rather than the original 'rb': bytes words would break
        # the str-based processing on Python 3.
        path = read_directory2 + '/' + batch_index[i] + '.txt'
        with open(path) as handle:
            return [row.split()[0] for row in handle]

    def _centers_for(i):
        # Transposed so iteration yields one center vector per row.
        return np.loadtxt(read_directory1 + '/' + str(i + 1) + '.txt').T

    # First sweep: collect the union vocabulary.  Track membership in a set
    # for O(1) lookups instead of re-scanning the list per word.
    new_word_list = []
    known = set()
    for i in range(file_number):
        vocab = _words_for(i)
        for vec in _centers_for(i):
            for word in reflect_vsm_to_wordlist(vec, vocab):
                if word not in known:
                    known.add(word)
                    new_word_list.append(word)

    # Second sweep: express each center over the merged vocabulary and write
    # one file per batch.
    for i in range(file_number):
        vocab = _words_for(i)
        lines = []
        for vec in _centers_for(i):
            # Weights above the noise threshold; everything else writes "0".
            weights = {vocab[k]: vec[k]
                       for k in range(len(vec)) if vec[k] > 0.0001}
            lines.append(" ".join(str(weights.get(word, 0))
                                  for word in new_word_list))
        quick_write_list_to_text(lines,
                                 write_directory + '/' + str(i + 1) + '.txt')
    quick_write_list_to_text(new_word_list, write_filename)