def create_file_analysis_all_group(pre_path):
    """Write a group-by-file matrix of ``times`` values.

    Group ids are read from ``<pre_path>all_group.txt``. Every file found in
    ``<pre_path>divide_groups_in_label`` is loaded as a list of JSON objects.
    For each group id (one matrix row) and each loaded file (one column), the
    cell holds the ``"times"`` field of the object returned by
    ``lib.find_obj_by_field`` for that group id, or ``0`` when the lookup
    result is falsy. The matrix is written to
    ``<pre_path>analysis_all_group.txt``.

    Args:
        pre_path: Path prefix that is concatenated directly with the file and
            folder names (no separator is inserted).
    """
    print('---------create_file_analysis_all_group---------')
    group_ids = read.line_in_file_rstrip(path_file=pre_path + 'all_group.txt')
    file_names = read.name_file_in_folder(
        path_folfer=pre_path + 'divide_groups_in_label')

    # Load every label file once up front; each becomes one matrix column.
    records_per_file = [
        read.get_list_obj_in_file_json(
            path_file=pre_path + 'divide_groups_in_label/' + file_name)
        for file_name in file_names
    ]

    rows = []
    for group_id in group_ids:
        row = []
        for records in records_per_file:
            hit = lib.find_obj_by_field(
                L=records, target=group_id, field="group_id")
            # A falsy lookup result means the group is absent from this file.
            row.append(hit["times"] if hit else 0)
        rows.append(row)

    write.matrix(path_file=pre_path + 'analysis_all_group.txt', list_data=rows)
def create_data(pre_path):
    """Replace group ids with their labels for every file in ``groups_in``.

    The group/label objects are loaded from ``<pre_path>group_label.json``.
    For each file in ``<pre_path>groups_in``, every line is turned into the
    sorted list of ``"label"`` values of the groups that
    ``lib.find_obj_by_field`` resolves; groups with a falsy lookup result are
    skipped. Lines that end up with no label are dropped and counted. The
    remaining lines are written to
    ``<pre_path>get_label_for_groups/<file name>`` and the number of dropped
    lines for that file is printed.

    NOTE(review): a second ``create_data`` is defined later in this source;
    if both live in the same module the later definition replaces this one.

    Args:
        pre_path: Path prefix that is concatenated directly with the file and
            folder names (no separator is inserted).
    """
    print('---------create_data_get_label_for_groups---------')
    list_name_file = read.name_file_in_folder(path_folfer=pre_path + 'groups_in')
    list_group_label = read.get_list_obj_in_file_json(
        path_file=pre_path + 'group_label.json')

    for name_file in list_name_file:
        list_data = read.elements_in_each_line_of_file(
            path=pre_path + 'groups_in/' + name_file)
        new_file = []
        # Reset for each file: the message below names a single file, so the
        # count must not carry over from the files processed before it.
        count_num_lines_no_guess = 0

        for line in list_data:
            new_line = []
            for group in line:
                label = lib.find_obj_by_field(
                    L=list_group_label, target=group, field="group_id")
                if label:
                    new_line.append(label["label"])

            if new_line:
                new_line.sort()
                new_file.append(new_line)
            else:
                # No group on this line could be mapped to a label.
                count_num_lines_no_guess += 1

        write.w_space_w_in_line(
            path_file=pre_path + 'get_label_for_groups/' + name_file,
            list_data=new_file)
        print("Quantity lines no guess in " + name_file + ": "
              + str(count_num_lines_no_guess))
def create_file_all_group(pre_path):
    """Collect every ``group_id`` from the label files into ``all_group.txt``.

    Each line of each file in ``<pre_path>divide_groups_in_label`` is parsed
    as a JSON object and its ``"group_id"`` is collected. The collected ids
    are sorted, passed through ``lib.distinct`` and written to
    ``<pre_path>all_group.txt``.

    Args:
        pre_path: Path prefix that is concatenated directly with the file and
            folder names (no separator is inserted).
    """
    print('---------create_file_all_group---------')
    file_names = read.name_file_in_folder(
        path_folfer=pre_path + 'divide_groups_in_label')

    group_ids = []
    for file_name in file_names:
        raw_lines = read.line_in_file(
            path_file=pre_path + 'divide_groups_in_label/' + file_name)
        group_ids.extend(json.loads(raw)["group_id"] for raw in raw_lines)

    # lib.distinct receives the ids already in sorted order, as before.
    unique_ids = lib.distinct(sorted(group_ids))
    write.each_line_by_path(
        path_file=pre_path + 'all_group.txt', list_data=unique_ids)
def read_list_percent():
    """Load the training rows and a parallel list of labels.

    Every file listed under the module-level ``path_percent_train`` is read
    with ``read.matrix_percent``. All rows of the i-th listed file receive
    the i-th label of the fixed label list below.

    NOTE(review): the label therefore depends on the order in which
    ``read.name_file_in_folder`` returns the files, and a sixth file raises
    ``IndexError`` -- confirm the folder layout with the caller.

    Returns:
        A ``(rows, labels)`` tuple of two lists of equal length.
    """
    labels_by_position = [
        '__label__18-24',
        '__label__25-34',
        '__label__35-44',
        '__label__45-54',
        '__label__55+'
    ]
    file_names = read.name_file_in_folder(path_folfer=path_percent_train)

    train_rows = []
    train_labels = []
    for position, file_name in enumerate(file_names):
        rows = read.matrix_percent(path=path_percent_train + file_name)
        file_label = labels_by_position[position]
        already_loaded = len(train_rows)
        train_rows.extend(rows)
        # One label per row that was just added.
        train_labels.extend([file_label] * (len(train_rows) - already_loaded))

    return train_rows, train_labels
def create_data(pre_path):
    """Turn each line of labels into a five-element smoothed ratio vector.

    For every file in ``<pre_path>get_label_for_groups`` and every line in
    it, a vector with one slot per label in the fixed list below is built.
    Each slot starts at ``1 / (len(line) + 5)``. For every object returned
    by ``lib.analysis_list_sorted(line)``, the slot at the index that
    ``lib.find_index`` gives for its ``"label"`` is set to
    ``(times + 1) / (len(line) + 5)``. The vectors of one file are written
    to ``<pre_path>percent/<file name>``.

    Args:
        pre_path: Path prefix that is concatenated directly with the file and
            folder names (no separator is inserted).
    """
    print('---------create_data_percent---------')
    label_order = [
        "__label__18-24",
        "__label__25-34",
        "__label__35-44",
        "__label__45-54",
        "__label__55+"
    ]
    file_names = read.name_file_in_folder(
        path_folfer=pre_path + 'get_label_for_groups')

    for file_name in file_names:
        label_lines = read.elements_in_each_line_of_file(
            path=pre_path + 'get_label_for_groups/' + file_name)
        print(file_name)

        vectors = []
        for line in label_lines:
            label_counts = lib.analysis_list_sorted(line)
            # Shared denominator: number of labels on the line plus 5.
            denominator = len(line) + 5
            ratios = [1 / denominator for _ in range(5)]
            for entry in label_counts:
                slot = lib.find_index(label_order, entry["label"])
                ratios[slot] = (entry["times"] + 1) / denominator
            vectors.append(ratios)

        write.matrix(path_file=pre_path + 'percent/' + file_name,
                     list_data=vectors)